# PaCMAP Benchmark

## Load data

Hide code cell source
import pandas as pd
from source.pacmap_functions import *

# Locations of the pipeline inputs and of the cached, processed outputs.
# These stay plain strings with a trailing slash because later cells build
# file names by string concatenation (e.g. output_path + 'pacmap_output/...').
input_path = '../Data/Intermediate_Files/'
output_path = '../Data/Processed_Data/'

# Batch-corrected methylation table, rows ordered by index.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files produced by this project's own upstream steps.
df_methyl = pd.read_pickle(input_path + 'df_batch_corrected.pkl').sort_index()

# Clinical annotations, indexed by the first CSV column and ordered by index.
# low_memory=False makes pandas parse the file in a single pass.
df_labels = pd.read_csv(
    input_path + 'clinical_data.csv', index_col=0, low_memory=False
).sort_index()

# Constant label given to every row of the clinical table.
df_labels['PaCMAP Output'] = 'Patient Samples'
# Copy the batch annotation over from the methylation table; pandas aligns
# the assignment on the index, so rows missing from df_methyl become NaN.
df_labels['Batch'] = df_methyl['Batch']

# Adjacent f-string literals are used instead of a backslash continuation
# inside one literal, so the spacing of the message no longer depends on how
# the continuation line happens to be indented.
print(
    f' Dataset (df) contains {df_methyl.shape[1]}'
    f' columns (5mC nucleotides/probes) and {df_methyl.shape[0]} rows (samples).'
)

# Presumably Bokeh's notebook output hook, made available through the star
# import above -- TODO confirm.
output_notebook()

# Set the theme for the plot
curdoc().theme = 'light_minimal'  # or 'dark_minimal'
 Dataset (df) contains 333352 columns (5mC nucleotides/probes) and 3330 rows (samples).
Loading BokehJS ...

## All samples

Hide code cell source
# Clinical trials / cohorts included in the all-samples map.
clinical_trials = [
    'NOPHO ALL92-2000',
    'AAML0531',
    'AAML1031',
    'Beat AML Consortium',
    'TCGA AML',
    'CETLAM SMD-09 (MDS-tAML)',
    'French GRAALL 2003–2005',
    'TARGET ALL',
    'AAML03P1',
    'Japanese AML05',
    'CCG2961',
]

# Sample types included in the all-samples map.
sample_types = [
    'Diagnosis',
    'Primary Blood Derived Cancer - Bone Marrow',
    'Bone Marrow Normal',
    'Primary Blood Derived Cancer - Peripheral Blood',
    'Blood Derived Normal',
    'Likely Diagnosis',
    'Control (Healthy Donor)',
    'Relapse',
    'Recurrent Blood Derived Cancer - Bone Marrow',
    'Recurrent Blood Derived Cancer - Peripheral Blood',
    'Peripheral Blood Normal',
]

# Label columns handed to the plotter (and to the processor when the
# embedding is regenerated).
cols = [
    'PaCMAP Output',
    'Pathology Class',
    'WHO 2021 Diagnosis',
    'WHO AML 2021 Diagnosis',
    'WHO ALL 2021 Diagnosis',
    'ELN AML 2022 Diagnosis',
    'Age (group years)',
    'Batch',
    'Sex',
    'Clinical Trial',
    'Sample Type',
    'Karyotype',
    'Gene Fusion',
    'Patient_ID',
]

# The embedding is read from a cached CSV below. The commented-out steps are
# the ones that produced that CSV; uncomment them to regenerate it.
#
# processor = DataProcessor(df_labels.copy(), df_methyl, clinical_trials, sample_types, cols)
# processor.filter_data()
# processor.apply_pacmap()
# processor.join_labels()
# df = processor.df
#
# df.to_csv(output_path+'pacmap_output/pacmap_2d_output_acute_leukemia.csv')

df = pd.read_csv(
    output_path + 'pacmap_output/pacmap_2d_output_acute_leukemia.csv',
    index_col=0,
)

# Interactive scatter plot of the cached embedding.
plotter = BokehPlotter(
    df,
    cols,
    get_custom_color_palette(),
    title='The Methylome Atlas of Acute Leukemia',
    x_range=(-50, 50),
    y_range=(-50, 50),
    datapoint_size=3,
)
plotter.plot()

## Pediatric diagnostic AML samples

Hide code cell source
# Clinical trials included in the pediatric diagnostic AML map.
clinical_trials = [
    'AAML0531',
    'AAML1031',
    'AAML03P1',
    'CCG2961',
    'Japanese AML05',
]

# Sample types included in the pediatric diagnostic AML map.
sample_types = [
    'Diagnosis',
    'Primary Blood Derived Cancer - Bone Marrow',
    'Bone Marrow Normal',
    'Primary Blood Derived Cancer - Peripheral Blood',
    'Blood Derived Normal',
]

# Label columns handed to the plotter (and to the processor when the
# embedding is regenerated).
cols = [
    'PaCMAP Output',
    'Pathology Class',
    'WHO AML 2021 Diagnosis',
    'ELN AML 2022 Diagnosis',
    'FAB',
    'FLT3 ITD',
    'Age (group years)',
    'Complex Karyotype',
    'Primary Cytogenetic Code',
    'Batch',
    'Sex',
    'MRD 1 Status',
    'Leucocyte counts (10⁹/L)',
    'Risk Group',
    'Race or ethnic group',
    'Clinical Trial',
    'Vital Status',
    'First Event',
    'Sample Type',
    'Karyotype',
    'Gene Fusion',
    'Patient_ID',
]

# The embedding is read from a cached CSV below. The commented-out steps are
# the ones that produced that CSV; uncomment them to regenerate it.
#
# processor = DataProcessor(df_labels.copy(), df_methyl, clinical_trials, sample_types, cols)
# processor.filter_data()
# processor.apply_pacmap()
# processor.join_labels()
# df2 = processor.df
#
# df2.to_csv(output_path+'pacmap_output/pacmap_2d_output_peds_dx_aml.csv')

df2 = pd.read_csv(
    output_path + 'pacmap_output/pacmap_2d_output_peds_dx_aml.csv',
    index_col=0,
)

# Interactive scatter plot of the cached embedding.
plotter = BokehPlotter(
    df2,
    cols,
    get_custom_color_palette(),
    title='Map of Pediatric AML at Diagnosis',
    x_range=(-45, 45),
    y_range=(-45, 45),
    datapoint_size=4,
)
plotter.plot()

## KMT2A diagnostic pediatric AML samples

Hide code cell source
# Clinical trials included in the KMT2A map.
clinical_trials = [
    'AAML0531',
    'AAML1031',
    'AAML03P1',
    'CCG2961',
    'Japanese AML05',
]

# Sample types included in the KMT2A map.
sample_types = [
    'Diagnosis',
    'Primary Blood Derived Cancer - Bone Marrow',
    'Bone Marrow Normal',
    'Primary Blood Derived Cancer - Peripheral Blood',
    'Blood Derived Normal',
]

# Label columns handed to the plotter (and to the processor when the
# embedding is regenerated).
cols = [
    'PaCMAP Output',
    'Gene Fusion',
    'WHO AML 2021 Diagnosis',
    'ELN AML 2022 Diagnosis',
    'FAB',
    'FLT3 ITD',
    'Age (group years)',
    'Complex Karyotype',
    'Primary Cytogenetic Code',
    'Batch',
    'Sex',
    'MRD 1 Status',
    'Leucocyte counts (10⁹/L)',
    'Risk Group',
    'Race or ethnic group',
    'First Event',
    'Clinical Trial',
    'Vital Status',
    'Sample Type',
    'Karyotype',
    'Patient_ID',
]

# Keep only the clinical rows whose ELN AML 2022 diagnosis matches the
# t(9;11) KMT2A-rearrangement label. In this cell the subset is only used by
# the (commented-out) regeneration steps below.
kmt2a = df_labels[
    df_labels['ELN AML 2022 Diagnosis'].isin(
        ['AML with t(9;11)(p22;q23.3)/KMT2A-rearrangement']
    )
]

# The embedding is read from a cached CSV below. The commented-out steps are
# the ones that produced that CSV; uncomment them to regenerate it.
#
# processor = DataProcessor(kmt2a, df_methyl, clinical_trials, sample_types, cols, remove_duplicates=False)
# processor.filter_data()
# processor.apply_pacmap()
# processor.join_labels()
# df3 = processor.df
#
# df3.to_csv(output_path+'pacmap_output/pacmap_2d_output_kmt2a.csv')

df3 = pd.read_csv(
    output_path + 'pacmap_output/pacmap_2d_output_kmt2a.csv',
    index_col=0,
)

# Interactive scatter plot of the cached embedding.
plotter = BokehPlotter(
    df3,
    cols,
    get_custom_color_palette(),
    title='KMT2A (MLL) Pediatric AML Samples',
    x_range=(-40, 40),
    y_range=(-40, 40),
    datapoint_size=9,
)
plotter.plot()

## Watermark

Author: Francisco_Marchi@Lamba_Lab_UF

Python implementation: CPython
Python version       : 3.8.16
IPython version      : 8.12.2

numpy  : 1.24.3
pandas : 2.0.2
bokeh  : 3.1.1
pacmap : 0.7.0
itables: 1.5.2

Compiler    : GCC 11.3.0
OS          : Linux
Release     : 5.15.90.1-microsoft-standard-WSL2
Machine     : x86_64
Processor   : x86_64
CPU cores   : 20
Architecture: 64bit